import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
# Read the diamonds dataset from its CSV file and preview the first rows.
csv_path = "C:/Users/Rakesh/Datasets/diamonds.csv"
data = pd.read_csv(csv_path)
data.head()
| Unnamed: 0 | carat | cut | color | clarity | depth | table | price | x | y | z | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0.23 | Ideal | E | SI2 | 61.5 | 55.0 | 326 | 3.95 | 3.98 | 2.43 |
| 1 | 2 | 0.21 | Premium | E | SI1 | 59.8 | 61.0 | 326 | 3.89 | 3.84 | 2.31 |
| 2 | 3 | 0.23 | Good | E | VS1 | 56.9 | 65.0 | 327 | 4.05 | 4.07 | 2.31 |
| 3 | 4 | 0.29 | Premium | I | VS2 | 62.4 | 58.0 | 334 | 4.20 | 4.23 | 2.63 |
| 4 | 5 | 0.31 | Good | J | SI2 | 63.3 | 58.0 | 335 | 4.34 | 4.35 | 2.75 |
# Dimensions of the loaded frame as (rows, columns).
data.shape
(53940, 11)
# "Unnamed: 0" only repeats the row number, so remove it before the analysis.
data = data.drop(columns=["Unnamed: 0"])
data.head()
| carat | cut | color | clarity | depth | table | price | x | y | z | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.23 | Ideal | E | SI2 | 61.5 | 55.0 | 326 | 3.95 | 3.98 | 2.43 |
| 1 | 0.21 | Premium | E | SI1 | 59.8 | 61.0 | 326 | 3.89 | 3.84 | 2.31 |
| 2 | 0.23 | Good | E | VS1 | 56.9 | 65.0 | 327 | 4.05 | 4.07 | 2.31 |
| 3 | 0.29 | Premium | I | VS2 | 62.4 | 58.0 | 334 | 4.20 | 4.23 | 2.63 |
| 4 | 0.31 | Good | J | SI2 | 63.3 | 58.0 | 335 | 4.34 | 4.35 | 2.75 |
# Count the missing values in each column.
data.isnull().sum()
carat 0 cut 0 color 0 clarity 0 depth 0 table 0 price 0 x 0 y 0 z 0 dtype: int64
# Price against carat: marker size follows depth, colour follows cut, and an
# OLS trendline is requested.
figure = px.scatter(
    data_frame=data,
    x="carat",
    y="price",
    size="depth",
    color="cut",
    trendline="ols",
)
figure.show()
# Derive a "size" feature as the product of the three dimension columns
# (x * y * z), then preview the result.
xy_product = data["x"].mul(data["y"])
data["size"] = xy_product.mul(data["z"])
data.head()
| carat | cut | color | clarity | depth | table | price | x | y | z | size | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.23 | Ideal | E | SI2 | 61.5 | 55.0 | 326 | 3.95 | 3.98 | 2.43 | 38.202030 |
| 1 | 0.21 | Premium | E | SI1 | 59.8 | 61.0 | 326 | 3.89 | 3.84 | 2.31 | 34.505856 |
| 2 | 0.23 | Good | E | VS1 | 56.9 | 65.0 | 327 | 4.05 | 4.07 | 2.31 | 38.076885 |
| 3 | 0.29 | Premium | I | VS2 | 62.4 | 58.0 | 334 | 4.20 | 4.23 | 2.63 | 46.724580 |
| 4 | 0.31 | Good | J | SI2 | 63.3 | 58.0 | 335 | 4.34 | 4.35 | 2.75 | 51.917250 |
# Price against the derived size: the same column drives both the x axis and
# the marker size; colour follows cut and an OLS trendline is requested.
figure = px.scatter(
    data_frame=data,
    x="size",
    y="price",
    size="size",
    color="cut",
    trendline="ols",
)
figure.show()
The scatter plot shows a roughly linear relationship between price and size.
# Price distribution per cut, drawn twice: first grouped by color, then by
# clarity. `fig` ends up bound to the clarity plot, as before.
for hue in ("color", "clarity"):
    fig = px.box(data_frame=data, x="cut", y="price", color=hue)
    fig.show()
# Correlation matrix of the numeric columns only. At this point cut, color and
# clarity still hold strings; calling data.corr() on the whole frame raises a
# ValueError on pandas >= 2.0 (numeric_only now defaults to False), so the
# numeric columns are selected explicitly. This works on older pandas too.
correlation = data.select_dtypes(include="number").corr()

# Rank every numeric feature by its correlation with price, strongest first.
print(correlation["price"].sort_values(ascending=False))
price 1.000000 carat 0.921591 size 0.902385 x 0.884435 y 0.865421 z 0.861249 table 0.127134 depth -0.010647 Name: price, dtype: float64
# Display the full correlation matrix.
correlation
| carat | depth | table | price | x | y | z | size | |
|---|---|---|---|---|---|---|---|---|
| carat | 1.000000 | 0.028224 | 0.181618 | 0.921591 | 0.975094 | 0.951722 | 0.953387 | 0.976308 |
| depth | 0.028224 | 1.000000 | -0.295779 | -0.010647 | -0.025289 | -0.029341 | 0.094924 | 0.009157 |
| table | 0.181618 | -0.295779 | 1.000000 | 0.127134 | 0.195344 | 0.183760 | 0.150929 | 0.167400 |
| price | 0.921591 | -0.010647 | 0.127134 | 1.000000 | 0.884435 | 0.865421 | 0.861249 | 0.902385 |
| x | 0.975094 | -0.025289 | 0.195344 | 0.884435 | 1.000000 | 0.974701 | 0.970772 | 0.956564 |
| y | 0.951722 | -0.029341 | 0.183760 | 0.865421 | 0.974701 | 1.000000 | 0.952006 | 0.975143 |
| z | 0.953387 | 0.094924 | 0.150929 | 0.861249 | 0.970772 | 0.952006 | 1.000000 | 0.950065 |
| size | 0.976308 | 0.009157 | 0.167400 | 0.902385 | 0.956564 | 0.975143 | 0.950065 | 1.000000 |
# Rank every feature by its correlation with carat, strongest first.
carat_corr = correlation["carat"].sort_values(ascending=False)
print(carat_corr)
carat 1.000000 size 0.976308 x 0.975094 z 0.953387 y 0.951722 price 0.921591 table 0.181618 depth 0.028224 Name: carat, dtype: float64
# Replace the cut labels with integer codes so the column can be used as a
# model feature.
# NOTE(review): the codes do not appear to follow the usual cut-quality ranking
# ("Very Good" is coded after "Good") — confirm this ordering is intended.
cut_codes = {
    "Ideal": 1,
    "Premium": 2,
    "Good": 3,
    "Very Good": 4,
    "Fair": 5,
}
data["cut"] = data["cut"].map(cut_codes)
Splitting the data into training and test sets
import sklearn
from sklearn.model_selection import train_test_split
# Preview the frame again; the cut column now holds the integer codes.
data.head()
| carat | cut | color | clarity | depth | table | price | x | y | z | size | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.23 | 1 | E | SI2 | 61.5 | 55.0 | 326 | 3.95 | 3.98 | 2.43 | 38.202030 |
| 1 | 0.21 | 2 | E | SI1 | 59.8 | 61.0 | 326 | 3.89 | 3.84 | 2.31 | 34.505856 |
| 2 | 0.23 | 3 | E | VS1 | 56.9 | 65.0 | 327 | 4.05 | 4.07 | 2.31 | 38.076885 |
| 3 | 0.29 | 2 | I | VS2 | 62.4 | 58.0 | 334 | 4.20 | 4.23 | 2.63 | 46.724580 |
| 4 | 0.31 | 3 | J | SI2 | 63.3 | 58.0 | 335 | 4.34 | 4.35 | 2.75 | 51.917250 |
# Feature matrix: carat, encoded cut and derived size, one row per diamond.
x = data[["carat", "cut", "size"]].to_numpy()

# Target as a 1-D array of shape (n_samples,). Selecting with a single label
# (not a list) avoids the (n_samples, 1) column vector that made scikit-learn
# emit a DataConversionWarning during fit.
y = data["price"].to_numpy()

# Hold out 10% of the rows for testing; the fixed seed keeps the split
# reproducible.
xtrain, xtest, ytrain, ytest = train_test_split(
    x, y, test_size=0.10, random_state=42
)
from sklearn.ensemble import RandomForestRegressor
# Seed the forest so repeated runs give the same model and predictions; the
# train/test split is already seeded with the same value.
model = RandomForestRegressor(random_state=42)

# np.ravel guarantees a 1-D target, so fit() does not emit a
# DataConversionWarning when ytrain arrives as an (n_samples, 1) column.
model.fit(xtrain, np.ravel(ytrain))
C:\Users\Rakesh\AppData\Local\Temp\ipykernel_928\1316745596.py:3: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
RandomForestRegressor()
# Interactively read one diamond's features and print the model's predicted
# price. The inputs must match the training features: carat, the integer cut
# code, and size (the x * y * z product used to build the "size" column).
print('Enter diamond details to predict price')

a = float(input("Carat Size: "))
if a <= 0:
    raise ValueError(f"Carat Size must be positive, got {a}")

b = int(input("Cut Type (Ideal:1,Premium:2,Good:3,Very Good:4, Fair:5)"))
# Only codes 1-5 were used to encode the cut column; anything else would be
# silently accepted by the model and yield a meaningless prediction.
if b not in range(1, 6):
    raise ValueError(f"Cut Type must be an integer from 1 to 5, got {b}")

c = float(input("Size: "))
if c <= 0:
    raise ValueError(f"Size must be positive, got {c}")

# predict() expects a 2-D array: one row per sample, one column per feature.
features = np.array([[a, b, c]])
print("Predicted Diamond's price= ", model.predict(features))
Enter diamond details to predict price Carat Size: 0.60 Cut Type (Ideal:1,Premium:2,Good:3,Very Good:4, Fair:5)2 Size: 40 Predicted Diamond's price= [934.43833333]
So, with the help of diamond details such as carat, cut and size, we are able to predict the price of a diamond.